{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5e6af81e",
   "metadata": {},
   "source": [
    "### Odds ratio from gProfiler results "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "5db130cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from pathlib import Path \n",
    "import numpy as np\n",
    "\n",
    "from scipy.stats import fisher_exact"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "81d9950d",
   "metadata": {},
   "outputs": [],
   "source": [
    "wkdir=\"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "gprofiler_results_path = wkdir_path.joinpath(\"3_misexp_genes/gprofiler_enrichment/gprofiler_hp_enrich_misexp_8650genes_protein_coding.csv\")\n",
    "gprofiler_results_df = pd.read_csv(gprofiler_results_path, sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "65a119b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sum_nested_list(nested_list):\n",
    "    total = 0\n",
    "    for item in nested_list:\n",
    "        if isinstance(item, list):\n",
    "            total += sum_nested_list(item)\n",
    "        else:\n",
    "            total += item \n",
    "    return total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "57964438",
   "metadata": {},
   "outputs": [],
   "source": [
    "enrich_results_dict = {}\n",
    "for index, row in gprofiler_results_df.iterrows(): \n",
    "    term_name, intersect, term, query, domain = row[\"term_name\"], row[\"intersection_size\"], row[\"term_size\"], row[\"query_size\"], row[\"effective_domain_size\"]\n",
    "    misexp_overlap = intersect\n",
    "    misexp_nonoverlap = query - intersect\n",
    "    nonmisexp_overlap = term - intersect\n",
    "    nonmisexp_nonoverlap = domain - query - nonmisexp_overlap\n",
    "    conting_mtx = [[misexp_overlap, misexp_nonoverlap], [nonmisexp_overlap, nonmisexp_nonoverlap]]\n",
    "    odds_ratio, p_value = fisher_exact(conting_mtx, alternative=\"two-sided\")\n",
    "    total = sum_nested_list(conting_mtx)\n",
    "    if total != domain: \n",
    "        raise ValueError(print(f\"Total domain size not equal: {total}, {domain}\"))\n",
    "    enrich_results_dict[index] = [term_name, odds_ratio, p_value]\n",
    "                   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "fb556a4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "enrich_results_df = pd.DataFrame.from_dict(enrich_results_dict, orient=\"index\", columns=[\"term_name\", \"odds_ratio\", \"pval\"])\n",
    "enrich_results_df[\"log_odds\"] = np.log(enrich_results_df.odds_ratio)\n",
    "# merge with results \n",
    "gprofiler_enrich_results_df = pd.merge(gprofiler_results_df, enrich_results_df, on=\"term_name\", how=\"inner\")\n",
    "# write to file\n",
    "gprofiler_enrich_results_path = wkdir_path.joinpath(\"3_misexp_genes/gprofiler_enrichment/gprofiler_hp_enrich_misexp_8650genes_protein_coding_log_odds.csv\")\n",
    "gprofiler_enrich_results_df.to_csv(gprofiler_enrich_results_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7eb0e3c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
